#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project ：academic_trend_analysis
@File    ：config.py
@IDE     ：PyCharm
@Author  ：iyoahs
@Date    ：2025/6/20 11:25
@Describe：Project configuration file
'''
import os

# --- Data-collection settings ---
# Search topics and raw arXiv category codes crawled on each run.
TOPICS = [
    "machine learning",
    "NLP",
    "computer vision",
    "reinforcement learning",
    "AI for science",
    "cs.LG",
    "cs.AI",
    "cs.CL",
    "cs.CV",
]
# Look-back window, in hours, for fetching recent papers.
HOURS = 72
# Upper bound on the number of papers fetched per run.
MAX_RESULTS = 1000
# Local directory where generated output is written.
OUTPUT_DIR = "output"
# JSON file recording state from the previous fetch.
LAST_INFO = "data/last_info.json"

# --- MongoDB settings ---
# Fallback connection string used when the MONGO_URI env var is unset.
_DEFAULT_MONGO_URI = "mongodb://localhost:27017"

# Connection target for stored papers; MONGO_URI overrides the local default.
MONGO_CONFIG = {
    "uri": os.getenv("MONGO_URI", _DEFAULT_MONGO_URI),
    "db_name": "arxiv_db",
    "collection_name": "papers",
}

# --- LLM settings ---
# DeepSeek chat-completion client configuration.
# SECURITY: an API key was previously hard-coded here and committed to source;
# that key must be considered leaked and rotated. The key is now read from the
# DEEPSEEK_API_KEY environment variable (empty string when unset).
LLM_CONFIG = {
    "model_name": "deepseek-chat",
    "api_key": os.getenv("DEEPSEEK_API_KEY", ""),  # never commit secrets
    "base_url": "https://api.deepseek.com",
    "max_tokens": 2000,  # per-request response token cap
}

# --- Spark settings ---
# One URI feeds both the Mongo input and output connector options; the
# MONGO_URI env var overrides the local default (db.collection suffix included).
_SPARK_MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017/arxiv_db.papers")

SPARK_CONFIG = {
    "app_name": "AcademicTrendAnalysis",
    "master": "local[*]",  # run locally on all available cores
    "spark.mongodb.input.uri": _SPARK_MONGO_URI,
    "spark.mongodb.output.uri": _SPARK_MONGO_URI,
    "spark.jars.packages": "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.sql.shuffle.partitions": "8",
}

# --- Keyword-analysis settings ---
# For each separator character, the alternative spellings tried when matching
# keyword variants (trailing "" means the separator may be dropped entirely).
_SEPARATOR_VARIANTS = {
    "-": [" ", "_", ""],
    "_": [" ", "-", ""],
    " ": ["-", "_", ""],
}

KEYWORD_ANALYSIS_CONFIG = {
    # presumably a 0-100 fuzzy-match score cutoff — confirm against the matcher
    "match_threshold": 85,
    "special_char_map": _SEPARATOR_VARIANTS,
}

# --- Streamlit settings ---
# Page-level options for the dashboard (keys match st.set_page_config's
# keyword arguments — presumably passed through; verify against the caller).
STREAMLIT_CONFIG = dict(
    page_title="AI Research Trend Analyzer",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded",
)